//#include <assert.h>

// Short aliases for the unsigned integer types used by the blitters below.
// The 8/16/32-bit widths implied by the names are assumed, not enforced.
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;

// `unsigned __int64` is a Microsoft-specific 64-bit integer type.
typedef unsigned __int64 u64;


/*
 * Simple2x - 2x nearest-neighbour enlargement of an image made of 16-bit
 * pixels.
 *
 * Every source pixel becomes a 2x2 block: each 16-bit value is written twice
 * horizontally, and the widened line is stored into two consecutive
 * destination lines.
 *
 *   srcPtr   : first source pixel; the source is read as one contiguous run
 *   srcPitch : source line size in bytes; only used to work out how far the
 *              destination pointers move after each line
 *   (3rd)    : unused
 *   dstPtr   : first destination pixel
 *   dstPitch : destination line size in bytes
 *   width    : source width in pixels; pixels are handled four at a time,
 *              so the trailing width % 4 pixels of a line are not converted
 *   height   : number of source lines
 *
 * The source pointer is never stepped by srcPitch, so the output is laid out
 * correctly only when width is a multiple of 4 and srcPitch == width * 2.
 *
 * Uses plain integer operations instead of MMX, so no MMX state is left
 * behind (no EMMS needed) and no compiler-specific inline assembly is
 * required.
 */
void Simple2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
			  u8 *dstPtr, u32 dstPitch, int width, int height)
{
	const u64 *src = (const u64*)srcPtr;
	u64 *upper = (u64*)dstPtr;
	u64 *lower = (u64*)(dstPtr + dstPitch);
	const int groups = width >> 2;

	// Each pass over a line moves the destination pointers by 2*srcPitch
	// bytes; add what is missing to reach the line two rows further down.
	// Expressed in 64-bit units because it is added to u64 pointers.
	const u32 skip = (dstPitch + (dstPitch - (srcPitch << 1))) >> 3;

	for (int y = 0; y < height; y++) {
		for (int x = 0; x < groups; x++) {
			// Four 16-bit pixels, first pixel in the low 16 bits.
			const u64 quad = *src++;
			const u64 p0 = quad & 0xffff;
			const u64 p1 = (quad >> 16) & 0xffff;
			const u64 p2 = (quad >> 32) & 0xffff;
			const u64 p3 = quad >> 48;

			// p0 p0 p1 p1 and p2 p2 p3 p3 (lowest pixel first).
			const u64 left = p0 | (p0 << 16) | (p1 << 32) | (p1 << 48);
			const u64 right = p2 | (p2 << 16) | (p3 << 32) | (p3 << 48);

			upper[0] = left;
			lower[0] = left;
			upper[1] = right;
			lower[1] = right;
			upper += 2;
			lower += 2;
		}
		upper += skip;
		lower += skip;
	}
}

// Pixel layouts handled here:
//   32 bits: (alpha << 24) | (red << 16) | (green << 8) | blue
//   16 bits: (red << 11) | (green << 5) | blue
//
// convert16to32: expand one 16-bit pixel (5 red, 6 green, 5 blue bits) into
// a 32-bit pixel with alpha forced to 0xff. Each channel is moved into the
// top bits of its 8-bit slot; the low bits of every channel stay zero, so
// the brightest 16-bit value maps to 0xf8/0xfc/0xf8 rather than 0xff.
//
// All arithmetic is done in u32 so that building the alpha byte never shifts
// a bit into the sign position of a signed int.
u32 convert16to32(u16 in){
	const u32 pixel = in;
	const u32 alpha = 0xff000000u;
	const u32 blue  = (pixel & 0x001fu) << 3;
	const u32 green = (pixel & 0x07e0u) << 5;
	const u32 red   = (pixel & 0xf800u) << 8;
	return alpha | red | green | blue;
}
 
/*
 * Simple2x32 - 2x nearest-neighbour enlargement of a 16-bit-pixel source
 * into a 32-bit-pixel destination.
 *
 * Each source pixel is converted with convert16to32() and written as a 2x2
 * block: twice side by side (one 64-bit store) on two consecutive
 * destination lines.
 *
 *   srcPtr   : first source pixel; the source is read as one contiguous run
 *   srcPitch : source line size in bytes; only used to work out how far the
 *              destination pointers move after each line
 *   (3rd)    : unused
 *   dstPtr   : first destination pixel
 *   dstPitch : destination line size in bytes
 *   width    : source width in pixels; pixels are handled two at a time, so
 *              an odd trailing pixel is not converted
 *   height   : number of source lines
 *
 * The source pointer is never stepped by srcPitch, so the output is laid out
 * correctly only when width is even and srcPitch == width * 2.
 */
void Simple2x32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
			  u8 *dstPtr, u32 dstPitch, int width, int height)
{
	const u32 *src = (const u32*)srcPtr;
	u64 *upper = (u64*)dstPtr;
	u64 *lower = (u64*)(dstPtr + dstPitch);
	const int pairs = width >> 1;

	// Each pass over a line moves the destination pointers by 4*srcPitch
	// bytes; add what is missing to reach the line two rows further down.
	// Expressed in 64-bit units because it is added to u64 pointers.
	const u32 skip = (dstPitch + (dstPitch - (srcPitch << 2))) >> 3;

	for (int y = 0; y < height; y++) {
		for (int x = 0; x < pairs; x++) {
			// Two 16-bit pixels, first pixel in the low half.
			const u32 pair = *src++;

			u64 first = u32(convert16to32(u16(pair)));
			first |= first << 32;      // same 32-bit pixel twice
			*upper++ = first;
			*lower++ = first;

			u64 second = u32(convert16to32(u16(pair >> 16)));
			second |= second << 32;
			*upper++ = second;
			*lower++ = second;
		}
		upper += skip;
		lower += skip;
	}
}

/*
 * Simple4x - 4x nearest-neighbour enlargement of an image made of 16-bit
 * pixels.
 *
 * Every source pixel becomes a 4x4 block: the 16-bit value is replicated
 * four times into one 64-bit word, and that word is stored into four
 * consecutive destination lines.
 *
 *   srcPtr   : first source pixel; the source is read as one contiguous run
 *   srcPitch : source line size in bytes; only used to work out how far the
 *              destination pointers move after each line
 *   (3rd)    : unused
 *   dstPtr   : first destination pixel
 *   dstPitch : destination line size in bytes
 *   width    : source width in pixels; pixels are handled four at a time,
 *              so the trailing width % 4 pixels of a line are not converted
 *   height   : number of source lines
 *
 * The source pointer is never stepped by srcPitch, so the output is laid out
 * correctly only when width is a multiple of 4 and srcPitch == width * 2.
 *
 * The four output rows sit exactly dstPitch bytes apart, and after each
 * source line the pointers move on by the remainder of four destination
 * lines. For a tightly packed destination (dstPitch == 4 * srcPitch) this is
 * the same layout as deriving both values from 2*dstPitch - 4*srcPitch, but
 * it also stays correct when the destination pitch is larger than that.
 *
 * Uses plain integer operations instead of MMX, so no MMX state is left
 * behind (no EMMS needed) and no compiler-specific inline assembly is
 * required.
 */
void Simple4x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
			  u8 *dstPtr, u32 dstPitch, int width, int height)
{
	const u64 *src = (const u64*)srcPtr;
	u64 *row0 = (u64*)dstPtr;
	u64 *row1 = (u64*)(dstPtr + dstPitch);
	u64 *row2 = (u64*)(dstPtr + dstPitch + dstPitch);
	u64 *row3 = (u64*)(dstPtr + dstPitch + dstPitch + dstPitch);
	const int groups = width >> 2;

	// Each pass over a line moves the row pointers by 4*srcPitch bytes; add
	// what is missing to reach the line four rows further down. Expressed
	// in 64-bit units because it is added to u64 pointers.
	const u32 skip = ((dstPitch << 2) - (srcPitch << 2)) >> 3;

	for (int y = 0; y < height; y++) {
		for (int x = 0; x < groups; x++) {
			// Four 16-bit pixels, first pixel in the low 16 bits.
			const u64 quad = *src++;

			for (int k = 0; k < 4; k++) {
				// Replicate pixel k into all four 16-bit lanes.
				u64 wide = (quad >> (16 * k)) & 0xffff;
				wide |= wide << 16;
				wide |= wide << 32;

				*row0++ = wide;
				*row1++ = wide;
				*row2++ = wide;
				*row3++ = wide;
			}
		}
		row0 += skip;
		row1 += skip;
		row2 += skip;
		row3 += skip;
	}
}

/*
 * Simple1x - unscaled copy of an image into a destination whose line pitch
 * may differ from the source's.
 *
 * Data is copied in 16-byte chunks, one chunk per 8 units of width (i.e.
 * 2 bytes per pixel). After each line the destination is additionally moved
 * by dstPitch - srcPitch bytes.
 *
 *   srcPtr   : first source byte; the source is read as one contiguous run
 *   srcPitch : source line size in bytes; only used for the destination gap
 *   (3rd)    : unused
 *   dstPtr   : first destination byte
 *   dstPitch : destination line size in bytes
 *   width    : line width in pixels; the trailing width % 8 pixels of a
 *              line are not copied
 *   height   : number of lines
 *
 * The source pointer is never stepped by srcPitch, so the output is laid out
 * correctly only when width is a multiple of 8 and srcPitch == width * 2.
 *
 * Written in portable code that reads its inputs from the named parameters
 * (never from whatever happens to be in CPU registers on entry) and that
 * copies nothing when width < 8 or height <= 0.
 */
void Simple1x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
              u8 *dstPtr, u32 dstPitch, int width, int height)
{
	const u8 *src = srcPtr;
	u8 *dst = dstPtr;
	const int rowBytes = (width >> 3) * 16;

	// Extra distance to the start of the next destination line once
	// rowBytes have been written; negative if the destination is narrower.
	const int gap = (int)dstPitch - (int)srcPitch;

	for (int y = 0; y < height; y++) {
		for (int i = 0; i < rowBytes; i++) {
			dst[i] = src[i];
		}
		src += rowBytes;
		dst += rowBytes + gap;
	}
}
// 50% nebula 
//void Simple1x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
/*              u8 *dstPtr, u32 dstPitch, int width, int height)
{
u16 *src = (u16*) srcPtr;
u16 *dst = (u16*) dstPtr;
u16 temp,temp1;
dstPitch = (dstPitch - srcPitch)>>1;
for (int i=0;i<height;i++) {
	for (int j=0;j<width;j++){
		temp = *src;
		temp1 = *dst;
		temp >>= 1;
		temp1 >>= 1;
//		temp &= 0x7bef;
//		temp &= 0x7bef;
		temp = temp+temp1;
		*dst = temp;
		dst++;
		src++;
	}
	dst += dstPitch;
}
} */

/*
 * Scanlines - horizontal 2x enlargement of an image made of 16-bit pixels,
 * written to every second destination line.
 *
 * Each source pixel is stored twice side by side. After a line has been
 * written the destination moves on by 2*dstPitch - 2*srcPitch bytes, so
 * (when srcPitch == width * 2) consecutive output lines start 2*dstPitch
 * bytes apart and the destination lines in between are left untouched.
 *
 *   srcPtr   : first source pixel; the source is read as one contiguous run
 *   srcPitch : source line size in bytes; only used to work out how far the
 *              destination moves after each line
 *   (3rd)    : unused
 *   dstPtr   : first destination pixel
 *   dstPitch : destination line size in bytes
 *   width    : source width in pixels; pixels are handled two at a time, so
 *              an odd trailing pixel is not converted
 *   height   : number of source lines
 *
 * Uses plain 16-bit loads and stores instead of MMX: no MMX state is left
 * behind (no EMMS needed), exactly the two pixels being converted are read
 * (nothing beyond the end of the source), and nothing is written when
 * width < 2 or height <= 0.
 */
void Scanlines(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
			  u8 *dstPtr, u32 dstPitch, int width, int height)
{
//	assert((width & 0x3) == 0);
	const u16 *src = (const u16*)srcPtr;
	u8 *line = dstPtr;
	const int pairs = width >> 1;

	// Byte distance from the end of the pixels just written to the start of
	// the next output line (two destination lines further down).
	const int skip = 2 * ((int)dstPitch - (int)srcPitch);

	for (int y = 0; y < height; y++) {
		u16 *out = (u16*)line;

		for (int x = 0; x < pairs; x++) {
			const u16 a = src[0];
			const u16 b = src[1];

			out[0] = a;
			out[1] = a;
			out[2] = b;
			out[3] = b;

			src += 2;
			out += 4;
		}
		line = (u8*)out + skip;
	}
}